knitr::opts_chunk$set(echo = TRUE, warning = FALSE, comment = FALSE, message = FALSE,
cache = FALSE)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(xgboost)
##
## Attaching package: 'xgboost'
##
## The following object is masked from 'package:dplyr':
##
## slice
library(Metrics)
library(ggpmisc)
## Loading required package: ggpp
##
## Attaching package: 'ggpp'
##
## The following object is masked from 'package:ggplot2':
##
## annotate
library(ggthemes)
match_dir = 'data/matchups/'
model_dir = 'data/models/'
# Set random seed
set.seed(799)
The purpose of this script is to apply the xgboost
algorithm to Remote Sensing Imagery of Lake Yojoa in Honduras, to
estimate Yojoa water clarity. You can read more about this lake here.
In this very stringent approach, we subset the Secchi-Landsat matchups by Landsat flyover, resulting in no image-date crossover between the train, test, or validation datasets.
Note, we do not use the ‘weight’ column in this analysis, but create
it here for the next script algorithm development,
10_xgboost_higher_secchi.
#list all the files in the match directory
match = list.files(match_dir)
prepData = function(df) {
#make a rowid column
df_prep = df %>%
rowid_to_column() %>%
mutate(secchi = as.numeric(secchi)) %>% #there's one wonky value in here with two decimal points... dropping from this analysis
filter(!is.na(secchi))
}
#load the matchup files
threeDay = read.csv(file.path(match_dir, match[grepl('three', match) & !grepl('us', match)])) %>%
prepData(.) %>%
mutate(weight = secchi/max(secchi)*10)
fiveDay = read.csv(file.path(match_dir, match[grepl('five', match) & !grepl('us', match)])) %>%
prepData(.) %>%
mutate(weight = secchi/max(secchi)*10)
jemma = read.csv(file.path(match_dir, match[grepl('jemma', match)])) %>%
prepData(.) %>%
mutate(weight = secchi/max(secchi)*10)
We want to predict the secchi value in these datasets,
so let’s set the target as that variable:
## Identify our target (value is secchi)
target <- 'secchi'
60% of the data as the ‘train’ set, and split the remainder between ‘test’ and ‘val’.
system.index = unique(threeDay$system.index)
##Pull 60% as training data
train_3_uid <- tibble(system.index) %>%
sample_frac(.6)
test_3_uid <- tibble(system.index) %>%
anti_join(., train_3_uid) %>%
sample_frac(0.5)
val_3_uid <- tibble(system.index) %>%
anti_join(., train_3_uid) %>%
anti_join(., test_3_uid)
train_3 <- train_3_uid %>%
left_join(., threeDay) %>%
data.frame(.)
test_3 <- test_3_uid %>%
left_join(., threeDay)%>%
data.frame(.)
val_3 <- val_3_uid %>%
left_join(., threeDay)%>%
data.frame(.)
system.index = unique(fiveDay$system.index)
##Pull 60% as training data
train_5_uid <- tibble(system.index) %>%
sample_frac(.6)
test_5_uid <- tibble(system.index) %>%
anti_join(., train_5_uid) %>%
sample_frac(0.5)
val_5_uid <- tibble(system.index) %>%
anti_join(., train_5_uid) %>%
anti_join(., test_5_uid)
train_5 <- train_5_uid %>%
left_join(., fiveDay) %>%
data.frame(.)
test_5 <- test_5_uid %>%
left_join(., fiveDay)%>%
data.frame(.)
val_5 <- val_5_uid %>%
left_join(., fiveDay)%>%
data.frame(.)
The ‘local knowledge’ window creates a variable matchup window, where matchups can be within 7 days in all times of year except October and November, when the lake can have dramatic shifts in clarity in a short period of time.
system.index = unique(jemma$system.index)
##Pull 60% as training data
train_j_uid <- tibble(system.index) %>%
sample_frac(.6)
test_j_uid <- tibble(system.index) %>%
anti_join(., train_j_uid) %>%
sample_frac(0.5)
val_j_uid <- tibble(system.index) %>%
anti_join(., train_j_uid) %>%
anti_join(., test_j_uid)
train_j <- train_j_uid %>%
left_join(., jemma) %>%
data.frame(.)
test_j <- test_j_uid %>%
left_join(., jemma)%>%
data.frame(.)
val_j <- val_j_uid %>%
left_join(., jemma)%>%
data.frame(.)
To compare apples-to-apples between this and the next analysis, we’ll save these train/test sets.
save(train_3, test_3, val_3,
train_5, test_5, val_5,
train_j, test_j, val_j,
file = 'data/models/train_test_val_verystringent.RData')
Here, we indicate the features to be used in our models. We’ll use the visual bands and add in summaries of ERA5 met data. In our datasets, the 5-day met summaries have the suffix ‘_5’, etc. The first two models only inlcude recent weather summaries, but the final two models include 5 or 7 day weather summaries as well as the previous day’s weather.
band_met3_feats <- c('med_Blue_corr', 'med_Green_corr', 'med_Red_corr', 'med_Nir_corr',
'RN', 'BG', 'RB','GB',
'tot_sol_rad_KJpm2_3', 'max_temp_degK_3', 'mean_temp_degK_3', 'min_temp_degK_3',
'tot_precip_m_3', 'mean_wind_mps_3')
band_met5_feats <- c('med_Blue_corr', 'med_Green_corr', 'med_Red_corr', 'med_Nir_corr',
'RN', 'BG', 'RB','GB',
'tot_sol_rad_KJpm2_5', 'max_temp_degK_5', 'mean_temp_degK_5', 'min_temp_degK_5',
'tot_precip_m_5', 'mean_wind_mps_5')
band_met51_feats <- c('med_Blue_corr', 'med_Green_corr', 'med_Red_corr', 'med_Nir_corr',
'RN', 'BG', 'RB','GB',
'tot_sol_rad_KJpm2_5', 'max_temp_degK_5', 'mean_temp_degK_5', 'min_temp_degK_5',
'tot_precip_m_5', 'mean_wind_mps_5',
'solar_rad_KJpm2_prev', 'precip_m_prev','air_temp_degK_prev','wind_speed_mps_prev')
band_met71_feats <- c('med_Blue_corr', 'med_Green_corr', 'med_Red_corr', 'med_Nir_corr',
'RN', 'BG', 'RB','GB',
'tot_sol_rad_KJpm2_7', 'max_temp_degK_7', 'mean_temp_degK_7', 'min_temp_degK_7',
'tot_precip_m_7', 'mean_wind_mps_7',
'solar_rad_KJpm2_prev', 'precip_m_prev','air_temp_degK_prev','wind_speed_mps_prev')
## 3 day window, 3 days previous met
dtrain_3d_3m <- xgb.DMatrix(data = as.matrix(train_3[,band_met3_feats]),
label = train_3[,target])
dtest_3d_3m <- xgb.DMatrix(data = as.matrix(test_3[,band_met3_feats]),
label = test_3[,target])
dval_3d_3m <- xgb.DMatrix(data = as.matrix(val_3[,band_met3_feats]),
label = val_3[,target])
## 3 day window, 5 days previous met
dtrain_3d_5m <- xgb.DMatrix(data = as.matrix(train_3[,band_met5_feats]),
label = train_3[,target])
dtest_3d_5m <- xgb.DMatrix(data = as.matrix(test_3[,band_met5_feats]),
label = test_3[,target])
dval_3d_5m <- xgb.DMatrix(data = as.matrix(val_3[,band_met5_feats]),
label = val_3[,target])
## 3 day window, 5/1 days previous met
dtrain_3d_51m <- xgb.DMatrix(data = as.matrix(train_3[,band_met51_feats]),
label = train_3[,target])
dtest_3d_51m <- xgb.DMatrix(data = as.matrix(test_3[,band_met51_feats]),
label = test_3[,target])
dval_3d_51m <- xgb.DMatrix(data = as.matrix(val_3[,band_met51_feats]),
label = val_3[,target])
## 3 day window, 7/1 days previous met
dtrain_3d_71m <- xgb.DMatrix(data = as.matrix(train_3[,band_met71_feats]),
label = train_3[,target])
dtest_3d_71m <- xgb.DMatrix(data = as.matrix(test_3[,band_met71_feats]),
label = test_3[,target])
dval_3d_71m <- xgb.DMatrix(data = as.matrix(val_3[,band_met71_feats]),
label = val_3[,target])
## 5 day window, 3 days previous met
dtrain_5d_3m <- xgb.DMatrix(data = as.matrix(train_5[,band_met3_feats]),
label = train_5[,target])
dtest_5d_3m <- xgb.DMatrix(data = as.matrix(test_5[,band_met3_feats]),
label = test_5[,target])
dval_5d_3m <- xgb.DMatrix(data = as.matrix(val_5[,band_met3_feats]),
label = val_5[,target])
## 5 day window, 5 days previous met
dtrain_5d_5m <- xgb.DMatrix(data = as.matrix(train_5[,band_met5_feats]),
label = train_5[,target])
dtest_5d_5m <- xgb.DMatrix(data = as.matrix(test_5[,band_met5_feats]),
label = test_5[,target])
dval_5d_5m <- xgb.DMatrix(data = as.matrix(val_5[,band_met5_feats]),
label = val_5[,target])
## 5 day window, 5/1 days previous met
dtrain_5d_51m <- xgb.DMatrix(data = as.matrix(train_5[,band_met51_feats]),
label = train_5[,target])
dtest_5d_51m <- xgb.DMatrix(data = as.matrix(test_5[,band_met51_feats]),
label = test_5[,target])
dval_5d_51m <- xgb.DMatrix(data = as.matrix(val_5[,band_met51_feats]),
label = val_5[,target])
## 5 day window, 7/1 days previous met
dtrain_5d_71m <- xgb.DMatrix(data = as.matrix(train_5[,band_met71_feats]),
label = train_5[,target])
dtest_5d_71m <- xgb.DMatrix(data = as.matrix(test_5[,band_met71_feats]),
label = test_5[,target])
dval_5d_71m <- xgb.DMatrix(data = as.matrix(val_5[,band_met71_feats]),
label = val_5[,target])
## jemma window, 5 days previous met
dtrain_jd_5m <- xgb.DMatrix(data = as.matrix(train_j[,band_met5_feats]),
label = train_j[,target])
dtest_jd_5m <- xgb.DMatrix(data = as.matrix(test_j[,band_met5_feats]),
label = test_j[,target])
dval_jd_5m <- xgb.DMatrix(data = as.matrix(val_j[,band_met5_feats]),
label = val_j[,target])
## jemma window, 3 days previous met
dtrain_jd_3m <- xgb.DMatrix(data = as.matrix(train_j[,band_met3_feats]),
label = train_j[,target])
dtest_jd_3m <- xgb.DMatrix(data = as.matrix(test_j[,band_met3_feats]),
label = test_j[,target])
dval_jd_3m <- xgb.DMatrix(data = as.matrix(val_j[,band_met3_feats]),
label = val_j[,target])
## jemma window, 5/1 days previous met
dtrain_jd_51m <- xgb.DMatrix(data = as.matrix(train_j[,band_met51_feats]),
label = train_j[,target])
dtest_jd_51m <- xgb.DMatrix(data = as.matrix(test_j[,band_met51_feats]),
label = test_j[,target])
dval_jd_51m <- xgb.DMatrix(data = as.matrix(val_j[,band_met51_feats]),
label = val_j[,target])
## jemma window, 7/1 days previous met
dtrain_jd_71m <- xgb.DMatrix(data = as.matrix(train_j[,band_met71_feats]),
label = train_j[,target])
dtest_jd_71m <- xgb.DMatrix(data = as.matrix(test_j[,band_met71_feats]),
label = test_j[,target])
dval_jd_71m <- xgb.DMatrix(data = as.matrix(val_j[,band_met71_feats]),
label = val_j[,target])
This is an xgboost optimization method developed by Sam Sillen where you list many possible hyperparameter options and then create a matrix of all possible combinations - aka ‘grid search’ - and grab the top 20 performing combinations of hyperparameters by square error (our loss statistic).
# Hypertune xgboost parameters and save as 'best_params'
grid_train <- expand.grid(
max_depth= c(3,6,8),
subsample = c(.5,.8,1),
colsample_bytree= c(.5,.8,1),
eta = c(0.1, 0.3),
min_child_weight= c(3,5,7)
)
hypertune_xgboost = function(train,test, grid){
params <- list(booster = "gbtree", objective = 'reg:squarederror',
eta=grid$eta ,max_depth=grid$max_depth,
min_child_weight=grid$min_child_weight,
subsample=grid$subsample,
colsample_bytree=grid$colsample_bytree)
xgb.naive <- xgb.train(params = params, data = train, nrounds = 1000,
watchlist = list(train = train, val = test),
verbose = 0,
early_stopping_rounds = 20)
summary <- grid %>% mutate(val_loss = xgb.naive$best_score, best_message = xgb.naive$best_msg,
mod = list(xgb.naive))
return(summary)
}
Note, evaluation is turned off for these chunks as to not overwrite previous models and parameter tuning in next section.
## Hypertune xgboost 3 day window, 3 day met
xgboost_hypertune_3d_3m <- grid_train %>%
pmap_dfr(function(...) {
current <- tibble(...)
hypertune_xgboost(dtrain_3d_3m,dtest_3d_3m,current)
})
mod_summary_3d_3m <- xgboost_hypertune_3d_3m %>%
arrange(val_loss) %>%
dplyr::slice(1:20)
best_mod_3d_3m <- xgboost_hypertune_3d_3m[xgboost_hypertune_3d_3m$val_loss==min(xgboost_hypertune_3d_3m$val_loss),]
save(mod_summary_3d_3m,best_mod_3d_3m, file = 'data/models/paramsxg_ep_val_3d_3m.RData')
## Hypertune xgboost 3 day window, 5 day met
xgboost_hypertune_3d_5m <- grid_train %>%
pmap_dfr(function(...) {
current <- tibble(...)
hypertune_xgboost(dtrain_3d_5m,dtest_3d_5m,current)
})
mod_summary_3d_5m <- xgboost_hypertune_3d_5m %>%
arrange(val_loss) %>%
dplyr::slice(1:20)
best_mod_3d_5m <- xgboost_hypertune_3d_5m[xgboost_hypertune_3d_5m$val_loss==min(xgboost_hypertune_3d_5m$val_loss),]
save(mod_summary_3d_5m,best_mod_3d_5m, file = 'data/models/paramsxg_ep_val_3d_5m.RData')
## Hypertune xgboost 5 day window, 5/1 day met
xgboost_hypertune_3d_51m <- grid_train %>%
pmap_dfr(function(...) {
current <- tibble(...)
hypertune_xgboost(dtrain_3d_51m,dtest_3d_51m,current)
})
mod_summary_3d_51m <- xgboost_hypertune_3d_51m %>%
arrange(val_loss) %>%
dplyr::slice(1:20)
best_mod_3d_51m <- xgboost_hypertune_3d_51m[xgboost_hypertune_3d_51m$val_loss==min(xgboost_hypertune_3d_51m$val_loss),]
save(mod_summary_3d_51m,best_mod_3d_51m, file = 'data/models/paramsxg_ep_val_3d_51m.RData')
## Hypertune xgboost 5 day window, 7/1 day met
xgboost_hypertune_3d_71m <- grid_train %>%
pmap_dfr(function(...) {
current <- tibble(...)
hypertune_xgboost(dtrain_3d_71m,dtest_3d_71m,current)
})
mod_summary_3d_71m <- xgboost_hypertune_3d_71m %>%
arrange(val_loss) %>%
dplyr::slice(1:20)
best_mod_3d_71m <- xgboost_hypertune_3d_71m[xgboost_hypertune_3d_71m$val_loss==min(xgboost_hypertune_3d_71m$val_loss),]
save(mod_summary_3d_71m,best_mod_3d_71m, file = 'data/models/paramsxg_ep_val_3d_71m.RData')
## Hypertune xgboost 5 day window, 3 day met
xgboost_hypertune_5d_3m <- grid_train %>%
pmap_dfr(function(...) {
current <- tibble(...)
hypertune_xgboost(dtrain_5d_3m,dtest_5d_3m,current)
})
mod_summary_5d_3m <- xgboost_hypertune_5d_3m %>%
arrange(val_loss) %>%
dplyr::slice(1:20)
best_mod_5d_3m <- xgboost_hypertune_5d_3m[xgboost_hypertune_5d_3m$val_loss==min(xgboost_hypertune_5d_3m$val_loss),]
save(mod_summary_5d_3m,best_mod_5d_3m, file = 'data/models/paramsxg_ep_val_5d_3m.RData')
## Hypertune xgboost 5 day window, 5 day met
xgboost_hypertune_5d_5m <- grid_train %>%
pmap_dfr(function(...) {
current <- tibble(...)
hypertune_xgboost(dtrain_5d_5m,dtest_5d_5m,current)
})
mod_summary_5d_5m <- xgboost_hypertune_5d_5m %>%
arrange(val_loss) %>%
dplyr::slice(1:20)
best_mod_5d_5m <- xgboost_hypertune_5d_5m[xgboost_hypertune_5d_5m$val_loss==min(xgboost_hypertune_5d_5m$val_loss),]
save(mod_summary_5d_5m,best_mod_5d_5m, file = 'data/models/paramsxg_ep_val_5d_5m.RData')
## Hypertune xgboost 5 day window, 5/1 day met
xgboost_hypertune_5d_51m <- grid_train %>%
pmap_dfr(function(...) {
current <- tibble(...)
hypertune_xgboost(dtrain_5d_51m,dtest_5d_51m,current)
})
mod_summary_5d_51m <- xgboost_hypertune_5d_51m %>%
arrange(val_loss) %>%
dplyr::slice(1:20)
best_mod_5d_51m <- xgboost_hypertune_5d_51m[xgboost_hypertune_5d_51m$val_loss==min(xgboost_hypertune_5d_51m$val_loss),]
save(mod_summary_5d_51m,best_mod_5d_51m, file = 'data/models/paramsxg_ep_val_5d_51m.RData')
## Hypertune xgboost 5 day window, 7/1 day met
xgboost_hypertune_5d_71m <- grid_train %>%
pmap_dfr(function(...) {
current <- tibble(...)
hypertune_xgboost(dtrain_5d_71m,dtest_5d_71m,current)
})
mod_summary_5d_71m <- xgboost_hypertune_5d_71m %>%
arrange(val_loss) %>%
dplyr::slice(1:20)
best_mod_5d_71m <- xgboost_hypertune_5d_71m[xgboost_hypertune_5d_71m$val_loss==min(xgboost_hypertune_5d_71m$val_loss),]
save(mod_summary_5d_71m,best_mod_5d_71m, file = 'data/models/paramsxg_ep_val_5d_71m.RData')
## Hypertune xgboost jemma window, 3 day met
xgboost_hypertune_jd_3m <- grid_train %>%
pmap_dfr(function(...) {
current <- tibble(...)
hypertune_xgboost(dtrain_jd_3m,dtest_jd_3m,current)
})
mod_summary_jd_3m <- xgboost_hypertune_jd_3m %>%
arrange(val_loss) %>%
dplyr::slice(1:20)
best_mod_jd_3m <- xgboost_hypertune_jd_3m[xgboost_hypertune_jd_3m$val_loss==min(xgboost_hypertune_jd_3m$val_loss),]
save(mod_summary_jd_3m,best_mod_jd_3m, file = 'data/models/paramsxg_ep_val_jd_3m.RData')
## Hypertune xgboost jemma window, 5 day met
xgboost_hypertune_jd_5m <- grid_train %>%
pmap_dfr(function(...) {
current <- tibble(...)
hypertune_xgboost(dtrain_jd_5m,dtest_jd_5m,current)
})
mod_summary_jd_5m <- xgboost_hypertune_jd_5m %>%
arrange(val_loss) %>%
dplyr::slice(1:20)
best_mod_jd_5m <- xgboost_hypertune_jd_5m[xgboost_hypertune_jd_5m$val_loss==min(xgboost_hypertune_jd_5m$val_loss),]
save(mod_summary_jd_5m,best_mod_jd_5m, file = 'data/models/paramsxg_ep_val_jd_5m.RData')
## Hypertune xgboost jemma window, 5/1 day met
xgboost_hypertune_jd_51m <- grid_train %>%
pmap_dfr(function(...) {
current <- tibble(...)
hypertune_xgboost(dtrain_jd_51m,dtest_jd_51m,current)
})
mod_summary_jd_51m <- xgboost_hypertune_jd_51m %>%
arrange(val_loss) %>%
dplyr::slice(1:20)
best_mod_jd_51m <- xgboost_hypertune_jd_51m[xgboost_hypertune_jd_51m$val_loss==min(xgboost_hypertune_jd_51m$val_loss),]
save(mod_summary_jd_51m,best_mod_jd_51m, file = 'data/models/paramsxg_ep_val_jd_51m.RData')
## Hypertune xgboost jemma window, 7/1 day met
xgboost_hypertune_jd_71m <- grid_train %>%
pmap_dfr(function(...) {
current <- tibble(...)
hypertune_xgboost(dtrain_jd_71m,dtest_jd_71m,current)
})
mod_summary_jd_71m <- xgboost_hypertune_jd_71m %>%
arrange(val_loss) %>%
dplyr::slice(1:20)
best_mod_jd_71m <- xgboost_hypertune_jd_71m[xgboost_hypertune_jd_71m$val_loss==min(xgboost_hypertune_jd_71m$val_loss),]
save(mod_summary_jd_71m,best_mod_jd_71m, file = 'data/models/paramsxg_ep_val_jd_71m.RData')
load('data/models/paramsxg_ep_val_3d_5m.RData')
load('data/models/paramsxg_ep_val_3d_3m.RData')
load('data/models/paramsxg_ep_val_3d_51m.RData')
load('data/models/paramsxg_ep_val_3d_71m.RData')
load('data/models/paramsxg_ep_val_5d_5m.RData')
load('data/models/paramsxg_ep_val_5d_3m.RData')
load('data/models/paramsxg_ep_val_5d_51m.RData')
load('data/models/paramsxg_ep_val_5d_71m.RData')
load('data/models/paramsxg_ep_val_jd_5m.RData')
load('data/models/paramsxg_ep_val_jd_3m.RData')
load('data/models/paramsxg_ep_val_jd_51m.RData')
load('data/models/paramsxg_ep_val_jd_71m.RData')
Now that these are loaded, we need to look at the test/train statistics. Ideally the train/test RMSE are relatively close so we don’t choose too overfit of a model. Below, we apply the model to the validation dataset and plot the validation observed versus predicted.
The optimized booster was chosen as the highest-ranked booster where the train/test were within 0.15m RMSE. If no booster met that, the booster with the closest train/test RMSE was chosen.
mod_summary_3d_3m %>% select(best_message)
FALSE # A tibble: 20 × 1
FALSE best_message
FALSE <chr>
FALSE 1 "[12]\ttrain-rmse:0.335284\tval-rmse:1.087013"
FALSE 2 "[7]\ttrain-rmse:0.526785\tval-rmse:1.089217"
FALSE 3 "[7]\ttrain-rmse:0.531340\tval-rmse:1.093105"
FALSE 4 "[7]\ttrain-rmse:0.602711\tval-rmse:1.097848"
FALSE 5 "[8]\ttrain-rmse:0.558938\tval-rmse:1.102172"
FALSE 6 "[8]\ttrain-rmse:0.586042\tval-rmse:1.103532"
FALSE 7 "[7]\ttrain-rmse:0.591689\tval-rmse:1.118336"
FALSE 8 "[28]\ttrain-rmse:0.503954\tval-rmse:1.119174"
FALSE 9 "[9]\ttrain-rmse:0.569599\tval-rmse:1.125072"
FALSE 10 "[37]\ttrain-rmse:0.431969\tval-rmse:1.127650"
FALSE 11 "[24]\ttrain-rmse:0.415461\tval-rmse:1.130614"
FALSE 12 "[32]\ttrain-rmse:0.342518\tval-rmse:1.131802"
FALSE 13 "[11]\ttrain-rmse:0.451776\tval-rmse:1.132324"
FALSE 14 "[9]\ttrain-rmse:0.338201\tval-rmse:1.133378"
FALSE 15 "[7]\ttrain-rmse:0.589520\tval-rmse:1.135986"
FALSE 16 "[22]\ttrain-rmse:0.584090\tval-rmse:1.136325"
FALSE 17 "[7]\ttrain-rmse:0.598033\tval-rmse:1.136973"
FALSE 18 "[8]\ttrain-rmse:0.434725\tval-rmse:1.137252"
FALSE 19 "[9]\ttrain-rmse:0.430390\tval-rmse:1.141726"
FALSE 20 "[12]\ttrain-rmse:0.428501\tval-rmse:1.142100"
mod_summary_3d_5m %>% select(best_message)
FALSE # A tibble: 20 × 1
FALSE best_message
FALSE <chr>
FALSE 1 "[9]\ttrain-rmse:0.463925\tval-rmse:1.128587"
FALSE 2 "[9]\ttrain-rmse:0.525839\tval-rmse:1.142571"
FALSE 3 "[8]\ttrain-rmse:0.495013\tval-rmse:1.158728"
FALSE 4 "[7]\ttrain-rmse:0.620163\tval-rmse:1.160755"
FALSE 5 "[10]\ttrain-rmse:0.437453\tval-rmse:1.169238"
FALSE 6 "[6]\ttrain-rmse:0.725334\tval-rmse:1.200457"
FALSE 7 "[7]\ttrain-rmse:0.507380\tval-rmse:1.202151"
FALSE 8 "[12]\ttrain-rmse:0.500341\tval-rmse:1.203530"
FALSE 9 "[6]\ttrain-rmse:0.644412\tval-rmse:1.209681"
FALSE 10 "[28]\ttrain-rmse:0.528243\tval-rmse:1.213825"
FALSE 11 "[15]\ttrain-rmse:0.317623\tval-rmse:1.216757"
FALSE 12 "[10]\ttrain-rmse:0.452709\tval-rmse:1.217715"
FALSE 13 "[7]\ttrain-rmse:0.579336\tval-rmse:1.218872"
FALSE 14 "[30]\ttrain-rmse:0.385488\tval-rmse:1.219306"
FALSE 15 "[8]\ttrain-rmse:0.548343\tval-rmse:1.219763"
FALSE 16 "[32]\ttrain-rmse:0.251207\tval-rmse:1.221552"
FALSE 17 "[34]\ttrain-rmse:0.426312\tval-rmse:1.222602"
FALSE 18 "[25]\ttrain-rmse:0.533884\tval-rmse:1.234227"
FALSE 19 "[25]\ttrain-rmse:0.571146\tval-rmse:1.234590"
FALSE 20 "[28]\ttrain-rmse:0.430349\tval-rmse:1.235237"
mod_summary_3d_51m %>% select(best_message)
FALSE # A tibble: 20 × 1
FALSE best_message
FALSE <chr>
FALSE 1 "[9]\ttrain-rmse:0.482756\tval-rmse:1.069052"
FALSE 2 "[10]\ttrain-rmse:0.283959\tval-rmse:1.073175"
FALSE 3 "[28]\ttrain-rmse:0.141480\tval-rmse:1.082309"
FALSE 4 "[28]\ttrain-rmse:0.141667\tval-rmse:1.096474"
FALSE 5 "[7]\ttrain-rmse:0.455231\tval-rmse:1.112479"
FALSE 6 "[9]\ttrain-rmse:0.394357\tval-rmse:1.118956"
FALSE 7 "[26]\ttrain-rmse:0.466996\tval-rmse:1.126278"
FALSE 8 "[35]\ttrain-rmse:0.258252\tval-rmse:1.131860"
FALSE 9 "[10]\ttrain-rmse:0.235587\tval-rmse:1.132392"
FALSE 10 "[30]\ttrain-rmse:0.430396\tval-rmse:1.134458"
FALSE 11 "[11]\ttrain-rmse:0.442577\tval-rmse:1.144442"
FALSE 12 "[37]\ttrain-rmse:0.181998\tval-rmse:1.145793"
FALSE 13 "[11]\ttrain-rmse:0.405025\tval-rmse:1.146676"
FALSE 14 "[9]\ttrain-rmse:0.477978\tval-rmse:1.146866"
FALSE 15 "[8]\ttrain-rmse:0.409388\tval-rmse:1.146959"
FALSE 16 "[43]\ttrain-rmse:0.218057\tval-rmse:1.148687"
FALSE 17 "[7]\ttrain-rmse:0.547289\tval-rmse:1.150965"
FALSE 18 "[28]\ttrain-rmse:0.377968\tval-rmse:1.153367"
FALSE 19 "[36]\ttrain-rmse:0.228446\tval-rmse:1.157564"
FALSE 20 "[34]\ttrain-rmse:0.279642\tval-rmse:1.157663"
mod_summary_3d_71m %>% select(best_message)
FALSE # A tibble: 20 × 1
FALSE best_message
FALSE <chr>
FALSE 1 "[55]\ttrain-rmse:0.153186\tval-rmse:0.998934"
FALSE 2 "[32]\ttrain-rmse:0.276121\tval-rmse:1.092771"
FALSE 3 "[15]\ttrain-rmse:0.370437\tval-rmse:1.095125"
FALSE 4 "[18]\ttrain-rmse:0.143716\tval-rmse:1.101931"
FALSE 5 "[12]\ttrain-rmse:0.250379\tval-rmse:1.103327"
FALSE 6 "[7]\ttrain-rmse:0.515935\tval-rmse:1.106664"
FALSE 7 "[12]\ttrain-rmse:0.376771\tval-rmse:1.107672"
FALSE 8 "[53]\ttrain-rmse:0.096195\tval-rmse:1.108306"
FALSE 9 "[39]\ttrain-rmse:0.182362\tval-rmse:1.109379"
FALSE 10 "[46]\ttrain-rmse:0.051242\tval-rmse:1.112116"
FALSE 11 "[103]\ttrain-rmse:0.055456\tval-rmse:1.119989"
FALSE 12 "[28]\ttrain-rmse:0.379154\tval-rmse:1.126778"
FALSE 13 "[334]\ttrain-rmse:0.001169\tval-rmse:1.129741"
FALSE 14 "[21]\ttrain-rmse:0.255453\tval-rmse:1.133508"
FALSE 15 "[124]\ttrain-rmse:0.050760\tval-rmse:1.137539"
FALSE 16 "[7]\ttrain-rmse:0.576990\tval-rmse:1.138031"
FALSE 17 "[51]\ttrain-rmse:0.062310\tval-rmse:1.142040"
FALSE 18 "[10]\ttrain-rmse:0.502077\tval-rmse:1.142326"
FALSE 19 "[15]\ttrain-rmse:0.330915\tval-rmse:1.144333"
FALSE 20 "[9]\ttrain-rmse:0.404875\tval-rmse:1.145010"
# most of best models are overfit, so looking for train/test RMSE that are closer
optimized_booster_3d_3m <- mod_summary_3d_3m$mod[4][[1]]
optimized_booster_3d_5m <- mod_summary_3d_5m$mod[6][[1]]
optimized_booster_3d_51m <- mod_summary_3d_51m$mod[17][[1]]
optimized_booster_3d_71m <- mod_summary_3d_71m$mod[16][[1]]
#note that pretty much all of these are overfit.
# Apply best mod
preds_3 <- val_3 %>%
mutate(pred_secchi_3d_5m = predict(optimized_booster_3d_5m, dval_3d_5m),
pred_secchi_3d_3m = predict(optimized_booster_3d_3m, dval_3d_3m),
pred_secchi_3d_51m = predict(optimized_booster_3d_51m, dval_3d_51m),
pred_secchi_3d_71m = predict(optimized_booster_3d_71m, dval_3d_71m))
evals_3 <- preds_3 %>%
summarise(across(c(pred_secchi_3d_3m, pred_secchi_3d_5m, pred_secchi_3d_71m, pred_secchi_3d_51m),
list(rmse = ~rmse(secchi, .),
mae = ~mae(secchi, .),
mape = ~mape(secchi, .),
bias = ~bias(secchi, .),
p.bias = ~percent_bias(secchi, .),
smape = ~smape(secchi, .),
r2 = ~cor(secchi, .)^2),
.names = "{fn}_{col}"))
evals_3
FALSE rmse_pred_secchi_3d_3m mae_pred_secchi_3d_3m mape_pred_secchi_3d_3m
FALSE 1 1.2977 0.9105761 0.2724691
FALSE bias_pred_secchi_3d_3m p.bias_pred_secchi_3d_3m smape_pred_secchi_3d_3m
FALSE 1 0.4926344 0.06001908 0.280696
FALSE r2_pred_secchi_3d_3m rmse_pred_secchi_3d_5m mae_pred_secchi_3d_5m
FALSE 1 0.2612316 1.259056 0.8534543
FALSE mape_pred_secchi_3d_5m bias_pred_secchi_3d_5m p.bias_pred_secchi_3d_5m
FALSE 1 0.2373248 0.482596 0.07028512
FALSE smape_pred_secchi_3d_5m r2_pred_secchi_3d_5m rmse_pred_secchi_3d_71m
FALSE 1 0.2517531 0.3093081 1.220405
FALSE mae_pred_secchi_3d_71m mape_pred_secchi_3d_71m bias_pred_secchi_3d_71m
FALSE 1 0.7567457 0.2285944 0.2679509
FALSE p.bias_pred_secchi_3d_71m smape_pred_secchi_3d_71m r2_pred_secchi_3d_71m
FALSE 1 -0.006846514 0.2268041 0.2734262
FALSE rmse_pred_secchi_3d_51m mae_pred_secchi_3d_51m mape_pred_secchi_3d_51m
FALSE 1 1.176872 0.7801184 0.2244117
FALSE bias_pred_secchi_3d_51m p.bias_pred_secchi_3d_51m smape_pred_secchi_3d_51m
FALSE 1 0.4347431 0.05153744 0.2328084
FALSE r2_pred_secchi_3d_51m
FALSE 1 0.4175778
mod_summary_5d_3m %>% select(best_message)
FALSE # A tibble: 20 × 1
FALSE best_message
FALSE <chr>
FALSE 1 "[9]\ttrain-rmse:0.654004\tval-rmse:1.025381"
FALSE 2 "[9]\ttrain-rmse:0.675980\tval-rmse:1.077414"
FALSE 3 "[13]\ttrain-rmse:0.392443\tval-rmse:1.108009"
FALSE 4 "[33]\ttrain-rmse:0.629557\tval-rmse:1.112568"
FALSE 5 "[16]\ttrain-rmse:0.473889\tval-rmse:1.125959"
FALSE 6 "[25]\ttrain-rmse:0.658955\tval-rmse:1.138390"
FALSE 7 "[10]\ttrain-rmse:0.525231\tval-rmse:1.141358"
FALSE 8 "[10]\ttrain-rmse:0.629822\tval-rmse:1.141450"
FALSE 9 "[39]\ttrain-rmse:0.512495\tval-rmse:1.141883"
FALSE 10 "[46]\ttrain-rmse:0.207288\tval-rmse:1.142131"
FALSE 11 "[17]\ttrain-rmse:0.496344\tval-rmse:1.148032"
FALSE 12 "[42]\ttrain-rmse:0.453988\tval-rmse:1.149211"
FALSE 13 "[32]\ttrain-rmse:0.436843\tval-rmse:1.152110"
FALSE 14 "[28]\ttrain-rmse:0.530019\tval-rmse:1.158884"
FALSE 15 "[31]\ttrain-rmse:0.507990\tval-rmse:1.160336"
FALSE 16 "[15]\ttrain-rmse:0.491015\tval-rmse:1.163825"
FALSE 17 "[60]\ttrain-rmse:0.386376\tval-rmse:1.164033"
FALSE 18 "[36]\ttrain-rmse:0.431165\tval-rmse:1.165511"
FALSE 19 "[31]\ttrain-rmse:0.548566\tval-rmse:1.169325"
FALSE 20 "[8]\ttrain-rmse:0.592849\tval-rmse:1.169596"
mod_summary_5d_5m %>% select(best_message)
FALSE # A tibble: 20 × 1
FALSE best_message
FALSE <chr>
FALSE 1 "[16]\ttrain-rmse:0.189603\tval-rmse:1.008704"
FALSE 2 "[8]\ttrain-rmse:0.461622\tval-rmse:1.019533"
FALSE 3 "[7]\ttrain-rmse:0.630247\tval-rmse:1.038831"
FALSE 4 "[13]\ttrain-rmse:0.289668\tval-rmse:1.040078"
FALSE 5 "[10]\ttrain-rmse:0.271140\tval-rmse:1.050976"
FALSE 6 "[33]\ttrain-rmse:0.297841\tval-rmse:1.052385"
FALSE 7 "[6]\ttrain-rmse:0.724049\tval-rmse:1.055014"
FALSE 8 "[24]\ttrain-rmse:0.463819\tval-rmse:1.056837"
FALSE 9 "[23]\ttrain-rmse:0.616203\tval-rmse:1.058228"
FALSE 10 "[26]\ttrain-rmse:0.557614\tval-rmse:1.059505"
FALSE 11 "[12]\ttrain-rmse:0.318032\tval-rmse:1.059781"
FALSE 12 "[29]\ttrain-rmse:0.522020\tval-rmse:1.060922"
FALSE 13 "[18]\ttrain-rmse:0.361752\tval-rmse:1.061626"
FALSE 14 "[10]\ttrain-rmse:0.643946\tval-rmse:1.061922"
FALSE 15 "[9]\ttrain-rmse:0.504441\tval-rmse:1.064648"
FALSE 16 "[25]\ttrain-rmse:0.282102\tval-rmse:1.066199"
FALSE 17 "[10]\ttrain-rmse:0.528801\tval-rmse:1.066358"
FALSE 18 "[13]\ttrain-rmse:0.541832\tval-rmse:1.066795"
FALSE 19 "[12]\ttrain-rmse:0.582742\tval-rmse:1.070749"
FALSE 20 "[29]\ttrain-rmse:0.265237\tval-rmse:1.075284"
mod_summary_5d_51m %>% select(best_message)
FALSE # A tibble: 20 × 1
FALSE best_message
FALSE <chr>
FALSE 1 "[18]\ttrain-rmse:0.156256\tval-rmse:1.031431"
FALSE 2 "[13]\ttrain-rmse:0.474956\tval-rmse:1.072913"
FALSE 3 "[24]\ttrain-rmse:0.250120\tval-rmse:1.077066"
FALSE 4 "[41]\ttrain-rmse:0.040842\tval-rmse:1.087478"
FALSE 5 "[15]\ttrain-rmse:0.269935\tval-rmse:1.092607"
FALSE 6 "[35]\ttrain-rmse:0.198516\tval-rmse:1.099014"
FALSE 7 "[17]\ttrain-rmse:0.437568\tval-rmse:1.101220"
FALSE 8 "[16]\ttrain-rmse:0.340692\tval-rmse:1.104658"
FALSE 9 "[29]\ttrain-rmse:0.365529\tval-rmse:1.105211"
FALSE 10 "[48]\ttrain-rmse:0.366029\tval-rmse:1.106221"
FALSE 11 "[26]\ttrain-rmse:0.332063\tval-rmse:1.108211"
FALSE 12 "[129]\ttrain-rmse:0.167061\tval-rmse:1.109177"
FALSE 13 "[63]\ttrain-rmse:0.253126\tval-rmse:1.110308"
FALSE 14 "[34]\ttrain-rmse:0.512471\tval-rmse:1.110581"
FALSE 15 "[26]\ttrain-rmse:0.400767\tval-rmse:1.114893"
FALSE 16 "[74]\ttrain-rmse:0.122011\tval-rmse:1.117836"
FALSE 17 "[33]\ttrain-rmse:0.227487\tval-rmse:1.118952"
FALSE 18 "[19]\ttrain-rmse:0.413520\tval-rmse:1.119058"
FALSE 19 "[44]\ttrain-rmse:0.447269\tval-rmse:1.122793"
FALSE 20 "[83]\ttrain-rmse:0.087560\tval-rmse:1.123034"
mod_summary_5d_71m %>% select(best_message)
FALSE # A tibble: 20 × 1
FALSE best_message
FALSE <chr>
FALSE 1 "[47]\ttrain-rmse:0.024502\tval-rmse:1.028120"
FALSE 2 "[28]\ttrain-rmse:0.237254\tval-rmse:1.033099"
FALSE 3 "[22]\ttrain-rmse:0.291370\tval-rmse:1.041030"
FALSE 4 "[73]\ttrain-rmse:0.108837\tval-rmse:1.061617"
FALSE 5 "[136]\ttrain-rmse:0.143127\tval-rmse:1.062586"
FALSE 6 "[106]\ttrain-rmse:0.097370\tval-rmse:1.072494"
FALSE 7 "[31]\ttrain-rmse:0.124626\tval-rmse:1.082026"
FALSE 8 "[124]\ttrain-rmse:0.264698\tval-rmse:1.084581"
FALSE 9 "[34]\ttrain-rmse:0.345487\tval-rmse:1.085205"
FALSE 10 "[61]\ttrain-rmse:0.365072\tval-rmse:1.090076"
FALSE 11 "[43]\ttrain-rmse:0.070420\tval-rmse:1.091248"
FALSE 12 "[168]\ttrain-rmse:0.041307\tval-rmse:1.094215"
FALSE 13 "[11]\ttrain-rmse:0.430494\tval-rmse:1.098084"
FALSE 14 "[143]\ttrain-rmse:0.185685\tval-rmse:1.099572"
FALSE 15 "[67]\ttrain-rmse:0.164484\tval-rmse:1.101121"
FALSE 16 "[192]\ttrain-rmse:0.091564\tval-rmse:1.102078"
FALSE 17 "[28]\ttrain-rmse:0.323367\tval-rmse:1.103658"
FALSE 18 "[21]\ttrain-rmse:0.123058\tval-rmse:1.110751"
FALSE 19 "[64]\ttrain-rmse:0.119611\tval-rmse:1.113335"
FALSE 20 "[75]\ttrain-rmse:0.246888\tval-rmse:1.114426"
# most of best models are overfit, so looking for train/test RMSE that are closer
optimized_booster_5d_3m <- mod_summary_5d_3m$mod[2][[1]]
optimized_booster_5d_5m <- mod_summary_5d_5m$mod[7][[1]]
optimized_booster_5d_51m <- mod_summary_5d_51m$mod[14][[1]]
optimized_booster_5d_71m <- mod_summary_5d_71m$mod[13][[1]]
# none of the train/test rmse's are particularly close
# Apply best mod
preds_5 <- val_5 %>%
mutate(pred_secchi_5d_5m = predict(optimized_booster_5d_5m, dval_5d_5m),
pred_secchi_5d_3m = predict(optimized_booster_5d_3m, dval_5d_3m),
pred_secchi_5d_51m = predict(optimized_booster_5d_51m, dval_5d_51m),
pred_secchi_5d_71m = predict(optimized_booster_5d_71m, dval_5d_71m))
evals_5 <- preds_5 %>%
summarise(across(c(pred_secchi_5d_3m, pred_secchi_5d_5m, pred_secchi_5d_71m,pred_secchi_5d_51m),
list(rmse = ~rmse(secchi, .),
mae = ~mae(secchi, .),
mape = ~mape(secchi, .),
bias = ~bias(secchi, .),
p.bias = ~percent_bias(secchi, .),
smape = ~smape(secchi, .),
r2 = ~cor(secchi, .)^2),
.names = "{fn}_{col}"))
evals_5
FALSE rmse_pred_secchi_5d_3m mae_pred_secchi_5d_3m mape_pred_secchi_5d_3m
FALSE 1 0.9119791 0.7156508 0.2847216
FALSE bias_pred_secchi_5d_3m p.bias_pred_secchi_5d_3m smape_pred_secchi_5d_3m
FALSE 1 -0.2255441 -0.1753277 0.2466742
FALSE r2_pred_secchi_5d_3m rmse_pred_secchi_5d_5m mae_pred_secchi_5d_5m
FALSE 1 0.4971265 0.9212489 0.7305891
FALSE mape_pred_secchi_5d_5m bias_pred_secchi_5d_5m p.bias_pred_secchi_5d_5m
FALSE 1 0.2656387 0.07476309 -0.08659374
FALSE smape_pred_secchi_5d_5m r2_pred_secchi_5d_5m rmse_pred_secchi_5d_71m
FALSE 1 0.2478393 0.5146847 0.8785996
FALSE mae_pred_secchi_5d_71m mape_pred_secchi_5d_71m bias_pred_secchi_5d_71m
FALSE 1 0.7335627 0.2925701 -0.2129254
FALSE p.bias_pred_secchi_5d_71m smape_pred_secchi_5d_71m r2_pred_secchi_5d_71m
FALSE 1 -0.1689127 0.259052 0.5317007
FALSE rmse_pred_secchi_5d_51m mae_pred_secchi_5d_51m mape_pred_secchi_5d_51m
FALSE 1 0.8976901 0.7348639 0.2927805
FALSE bias_pred_secchi_5d_51m p.bias_pred_secchi_5d_51m smape_pred_secchi_5d_51m
FALSE 1 -0.2467542 -0.1804061 0.2564741
FALSE r2_pred_secchi_5d_51m
FALSE 1 0.520718
mod_summary_jd_3m %>% select(best_message)
FALSE # A tibble: 20 × 1
FALSE best_message
FALSE <chr>
FALSE 1 "[31]\ttrain-rmse:0.271855\tval-rmse:0.736526"
FALSE 2 "[30]\ttrain-rmse:0.331277\tval-rmse:0.740514"
FALSE 3 "[6]\ttrain-rmse:0.636894\tval-rmse:0.747420"
FALSE 4 "[18]\ttrain-rmse:0.462676\tval-rmse:0.750624"
FALSE 5 "[9]\ttrain-rmse:0.487818\tval-rmse:0.751262"
FALSE 6 "[39]\ttrain-rmse:0.201864\tval-rmse:0.753978"
FALSE 7 "[30]\ttrain-rmse:0.350368\tval-rmse:0.760111"
FALSE 8 "[31]\ttrain-rmse:0.283021\tval-rmse:0.760705"
FALSE 9 "[26]\ttrain-rmse:0.496021\tval-rmse:0.761479"
FALSE 10 "[16]\ttrain-rmse:0.332122\tval-rmse:0.762778"
FALSE 11 "[29]\ttrain-rmse:0.528990\tval-rmse:0.770757"
FALSE 12 "[6]\ttrain-rmse:0.719588\tval-rmse:0.771342"
FALSE 13 "[12]\ttrain-rmse:0.414653\tval-rmse:0.772027"
FALSE 14 "[47]\ttrain-rmse:0.283117\tval-rmse:0.773023"
FALSE 15 "[25]\ttrain-rmse:0.473404\tval-rmse:0.774132"
FALSE 16 "[35]\ttrain-rmse:0.540308\tval-rmse:0.776295"
FALSE 17 "[33]\ttrain-rmse:0.409611\tval-rmse:0.783011"
FALSE 18 "[28]\ttrain-rmse:0.388506\tval-rmse:0.786164"
FALSE 19 "[10]\ttrain-rmse:0.437183\tval-rmse:0.787884"
FALSE 20 "[7]\ttrain-rmse:0.465189\tval-rmse:0.789258"
mod_summary_jd_5m %>% select(best_message)
FALSE # A tibble: 20 × 1
FALSE best_message
FALSE <chr>
FALSE 1 "[6]\ttrain-rmse:0.736923\tval-rmse:0.756968"
FALSE 2 "[23]\ttrain-rmse:0.603866\tval-rmse:0.758037"
FALSE 3 "[9]\ttrain-rmse:0.453114\tval-rmse:0.765366"
FALSE 4 "[24]\ttrain-rmse:0.453520\tval-rmse:0.772487"
FALSE 5 "[7]\ttrain-rmse:0.663391\tval-rmse:0.773507"
FALSE 6 "[27]\ttrain-rmse:0.429082\tval-rmse:0.775678"
FALSE 7 "[23]\ttrain-rmse:0.603928\tval-rmse:0.778445"
FALSE 8 "[31]\ttrain-rmse:0.435165\tval-rmse:0.784245"
FALSE 9 "[20]\ttrain-rmse:0.612635\tval-rmse:0.784554"
FALSE 10 "[19]\ttrain-rmse:0.690123\tval-rmse:0.785796"
FALSE 11 "[6]\ttrain-rmse:0.670781\tval-rmse:0.786398"
FALSE 12 "[19]\ttrain-rmse:0.663948\tval-rmse:0.787706"
FALSE 13 "[20]\ttrain-rmse:0.683132\tval-rmse:0.789375"
FALSE 14 "[6]\ttrain-rmse:0.776569\tval-rmse:0.790957"
FALSE 15 "[10]\ttrain-rmse:0.309616\tval-rmse:0.791964"
FALSE 16 "[10]\ttrain-rmse:0.358972\tval-rmse:0.794892"
FALSE 17 "[8]\ttrain-rmse:0.485569\tval-rmse:0.797981"
FALSE 18 "[22]\ttrain-rmse:0.626119\tval-rmse:0.801180"
FALSE 19 "[28]\ttrain-rmse:0.505855\tval-rmse:0.801938"
FALSE 20 "[20]\ttrain-rmse:0.681854\tval-rmse:0.802170"
mod_summary_jd_51m %>% select(best_message)
FALSE # A tibble: 20 × 1
FALSE best_message
FALSE <chr>
FALSE 1 "[23]\ttrain-rmse:0.510618\tval-rmse:0.710481"
FALSE 2 "[5]\ttrain-rmse:0.964692\tval-rmse:0.712624"
FALSE 3 "[22]\ttrain-rmse:0.303457\tval-rmse:0.719948"
FALSE 4 "[7]\ttrain-rmse:0.511281\tval-rmse:0.720577"
FALSE 5 "[8]\ttrain-rmse:0.546478\tval-rmse:0.735050"
FALSE 6 "[12]\ttrain-rmse:0.183252\tval-rmse:0.740674"
FALSE 7 "[22]\ttrain-rmse:0.549096\tval-rmse:0.745440"
FALSE 8 "[23]\ttrain-rmse:0.683340\tval-rmse:0.746011"
FALSE 9 "[8]\ttrain-rmse:0.520877\tval-rmse:0.747215"
FALSE 10 "[23]\ttrain-rmse:0.463334\tval-rmse:0.754520"
FALSE 11 "[21]\ttrain-rmse:0.682103\tval-rmse:0.755574"
FALSE 12 "[9]\ttrain-rmse:0.313900\tval-rmse:0.756729"
FALSE 13 "[21]\ttrain-rmse:0.590486\tval-rmse:0.761808"
FALSE 14 "[7]\ttrain-rmse:0.751783\tval-rmse:0.762221"
FALSE 15 "[8]\ttrain-rmse:0.529460\tval-rmse:0.762431"
FALSE 16 "[22]\ttrain-rmse:0.522601\tval-rmse:0.763434"
FALSE 17 "[9]\ttrain-rmse:0.370283\tval-rmse:0.763454"
FALSE 18 "[22]\ttrain-rmse:0.590674\tval-rmse:0.768587"
FALSE 19 "[10]\ttrain-rmse:0.491277\tval-rmse:0.768924"
FALSE 20 "[6]\ttrain-rmse:0.594544\tval-rmse:0.771888"
mod_summary_jd_71m %>% select(best_message)
FALSE # A tibble: 20 × 1
FALSE best_message
FALSE <chr>
FALSE 1 "[12]\ttrain-rmse:0.184131\tval-rmse:0.727265"
FALSE 2 "[28]\ttrain-rmse:0.380783\tval-rmse:0.736337"
FALSE 3 "[25]\ttrain-rmse:0.513493\tval-rmse:0.750214"
FALSE 4 "[10]\ttrain-rmse:0.381673\tval-rmse:0.750951"
FALSE 5 "[8]\ttrain-rmse:0.359446\tval-rmse:0.764039"
FALSE 6 "[22]\ttrain-rmse:0.645680\tval-rmse:0.765264"
FALSE 7 "[12]\ttrain-rmse:0.263862\tval-rmse:0.767008"
FALSE 8 "[7]\ttrain-rmse:0.456479\tval-rmse:0.770218"
FALSE 9 "[6]\ttrain-rmse:0.672270\tval-rmse:0.772328"
FALSE 10 "[7]\ttrain-rmse:0.625718\tval-rmse:0.773075"
FALSE 11 "[27]\ttrain-rmse:0.394426\tval-rmse:0.773133"
FALSE 12 "[26]\ttrain-rmse:0.518413\tval-rmse:0.776593"
FALSE 13 "[23]\ttrain-rmse:0.655277\tval-rmse:0.778425"
FALSE 14 "[25]\ttrain-rmse:0.447504\tval-rmse:0.779710"
FALSE 15 "[36]\ttrain-rmse:0.226403\tval-rmse:0.782267"
FALSE 16 "[8]\ttrain-rmse:0.435856\tval-rmse:0.782729"
FALSE 17 "[10]\ttrain-rmse:0.280211\tval-rmse:0.783787"
FALSE 18 "[24]\ttrain-rmse:0.599641\tval-rmse:0.784227"
FALSE 19 "[6]\ttrain-rmse:0.590968\tval-rmse:0.785543"
FALSE 20 "[22]\ttrain-rmse:0.636012\tval-rmse:0.785663"
optimized_booster_jd_3m <- mod_summary_jd_3m$mod[3][[1]]
optimized_booster_jd_5m <- mod_summary_jd_5m$mod[1][[1]]
optimized_booster_jd_51m <- mod_summary_jd_51m$mod[14][[1]]
optimized_booster_jd_71m <- mod_summary_jd_71m$mod[9][[1]]
# Apply best mod
preds_jd <- val_j %>%
mutate(pred_secchi_jd_5m = predict(optimized_booster_jd_5m, dval_jd_5m),
pred_secchi_jd_3m = predict(optimized_booster_jd_3m, dval_jd_3m),
pred_secchi_jd_51m = predict(optimized_booster_jd_51m, dval_jd_51m),
pred_secchi_jd_71m = predict(optimized_booster_jd_71m, dval_jd_71m))
evals_jd <- preds_jd %>%
summarise(across(c(pred_secchi_jd_3m, pred_secchi_jd_5m, pred_secchi_jd_71m, pred_secchi_jd_51m),
list(rmse = ~rmse(secchi, .),
mae = ~mae(secchi, .),
mape = ~mape(secchi, .),
bias = ~bias(secchi, .),
p.bias = ~percent_bias(secchi, .),
smape = ~smape(secchi, .),
r2 = ~cor(secchi, .)^2),
.names = "{fn}_{col}"))
evals_jd
FALSE rmse_pred_secchi_jd_3m mae_pred_secchi_jd_3m mape_pred_secchi_jd_3m
FALSE 1 1.184459 0.7835531 0.2605014
FALSE bias_pred_secchi_jd_3m p.bias_pred_secchi_jd_3m smape_pred_secchi_jd_3m
FALSE 1 0.1094772 -0.06518408 0.2415055
FALSE r2_pred_secchi_jd_3m rmse_pred_secchi_jd_5m mae_pred_secchi_jd_5m
FALSE 1 0.1602012 1.14423 0.8223011
FALSE mape_pred_secchi_jd_5m bias_pred_secchi_jd_5m p.bias_pred_secchi_jd_5m
FALSE 1 0.2855643 0.1022169 -0.07604451
FALSE smape_pred_secchi_jd_5m r2_pred_secchi_jd_5m rmse_pred_secchi_jd_71m
FALSE 1 0.2580702 0.2423282 1.203813
FALSE mae_pred_secchi_jd_71m mape_pred_secchi_jd_71m bias_pred_secchi_jd_71m
FALSE 1 0.7648194 0.2245034 0.3801764
FALSE p.bias_pred_secchi_jd_71m smape_pred_secchi_jd_71m r2_pred_secchi_jd_71m
FALSE 1 0.03477333 0.2328292 0.2665007
FALSE rmse_pred_secchi_jd_51m mae_pred_secchi_jd_51m mape_pred_secchi_jd_51m
FALSE 1 1.127621 0.7438275 0.2439819
FALSE bias_pred_secchi_jd_51m p.bias_pred_secchi_jd_51m smape_pred_secchi_jd_51m
FALSE 1 -0.01828638 -0.1034457 0.2252636
FALSE r2_pred_secchi_jd_51m
FALSE 1 0.2675877
Keep in mind that all of these models seemed overfit in the train/test sets.
These all perform pretty poorly.
Keep in mind that all of these models seemed overfit in the train/test sets.
These look particularly poor at the higher secchi values.
The correlations here are quite strong and linear, but they are all pretty flat and will definitely over predict low Secchi and under predict high Secchi.
Choosing a ‘best performing’ model for these results is difficult. Because the predicted range is so small in the local knowledge set, I’m going to dismiss those from consideration. The 3-day window did not perform well by RMSE, but ‘looked’ better than the 5-day window. With some reservation, I’m going to apply the 5-day window with 7/1-day met summaries here as the best model from this batch. This could easily be swapped with the three-day window and 5/1 day met summary.
features = band_met71_feats
model = optimized_booster_5d_71m
met = '7/1 day met summaries'
window = '5 day window'
full_stack <- read_csv('data/upstreamRS/yojoa_corr_rrs_met_scaled_v2023-06-15.csv') %>%
mutate(secchi = 100) %>%
prepData(.)
stack_xgb <- xgb.DMatrix(data = as.matrix(full_stack[,features]))
full_stack_simp <- full_stack %>%
mutate(secchi = predict(model, stack_xgb)) %>%
select(date, location, secchi, mission)
situ_stack <- read_csv('data/in-situ/Secchi_completedataset.csv') %>%
mutate(secchi = as.numeric(secchi),
date = mdy(date)) %>%
filter(!is.na(secchi)) %>%
mutate(mission = 'Measured') %>%
bind_rows(full_stack_simp)%>%
mutate(location = gsub(' ', '', location))
Let’s look at each of the site records alongside the Landsat-estimated Secchi depth.
FALSE [[1]]
FALSE
FALSE [[2]]
FALSE
FALSE [[3]]
FALSE
FALSE [[4]]
FALSE
FALSE [[5]]
FALSE
FALSE [[6]]
FALSE
FALSE [[7]]
FALSE
FALSE [[8]]
FALSE
FALSE [[9]]
FALSE
FALSE [[10]]
FALSE
FALSE [[11]]
FALSE
FALSE [[12]]
FALSE
FALSE [[13]]
FALSE
FALSE [[14]]
FALSE
FALSE [[15]]
FALSE
FALSE [[16]]
FALSE
FALSE [[17]]
FALSE
FALSE [[18]]
plotRecentBySite = function(site) {
ggplot(situ_stack %>%
filter(location == site), aes(x = date, y = secchi, color = mission,
shape = mission)) +
geom_point() +
labs(title = paste0('Yojoa Secchi 2018-2022 - Site ', site),
subtitle = paste0(window, ', ', met, ', high data stringency'),
y = 'Secchi (m)',
color = 'data source', shape = 'data source') +
scale_color_manual(values = c('grey10','grey30','grey50','grey70','blue')) +
theme_few() +
theme(legend.position = c(0.8,0.8)) +
scale_shape_manual(values = c(19,19,19,19,1)) +
scale_y_continuous(limits = c(0, max(situ_stack$secchi)), breaks = seq(0, max(situ_stack$secchi), 2)) +
scale_x_date(limits = c(ymd('2018-01-01'), max(situ_stack$date))) +
theme(plot.title = element_text(hjust = 0.5, face = 'bold'),
plot.subtitle = element_text(hjust = 0.5))
}
map(sort(unique(situ_stack$location)), plotRecentBySite)
FALSE [[1]]
FALSE
FALSE [[2]]
FALSE
FALSE [[3]]
FALSE
FALSE [[4]]
FALSE
FALSE [[5]]
FALSE
FALSE [[6]]
FALSE
FALSE [[7]]
FALSE
FALSE [[8]]
FALSE
FALSE [[9]]
FALSE
FALSE [[10]]
FALSE
FALSE [[11]]
FALSE
FALSE [[12]]
FALSE
FALSE [[13]]
FALSE
FALSE [[14]]
FALSE
FALSE [[15]]
FALSE
FALSE [[16]]
FALSE
FALSE [[17]]
FALSE
FALSE [[18]]
While there is plenty of variability across the lake, let’s summarize to a single value per date, since not all sites have the same density of record. Since there are a few oddballs in here (both in terms of measured and estimated), we’ll use the median Secchi across all sites.
lake_med <- situ_stack %>%
group_by(date,mission) %>%
summarize(across(where(is.numeric),median))
ggplot(lake_med, aes(x = date, y = secchi, color = mission, shape = mission)) +
geom_point() +
scale_color_manual(values = c('grey10','grey30','grey50','grey70','blue')) +
theme_few() +
labs(title = 'Yojoa Secchi 2018-2022\nwhole-lake median',
subtitle = paste0(window, ', ', met, ', high data stringency'),
y = 'median Secchi (m)',
color = 'data source', shape = 'data source') +
theme(legend.position = c(0.8,0.8)) +
scale_shape_manual(values = c(19,19,19,19,1)) +
scale_x_date(limits = c(as.Date('2018-01-01'), as.Date('2023-01-01')))+
theme(plot.title = element_text(hjust = 0.5, face = 'bold'),
plot.subtitle = element_text(hjust = 0.5))
feat_importance = xgb.importance(feature_names = band_met51_feats,
model = optimized_booster_jd_51m)
xgb.plot.importance(feat_importance)